% Script for doing the data preprocessing. We control for outliers here,
% calculate cryptocurrency characteristics and technical indicators. The
% data is resampled to the weekly frequency

% Clear console
clear; clc; close all;

% Set paths
sOldPath        = path;
sInDataPath     = './DATA/';
sOutDataPath    = './DATA/';
addpath('./Utils/');

% Resampling to weekly data as in Liu et al. If set to false, data is
% resampled from sunday to sunday
lResampleLiuEtAl    = true;

% Use artificial data
lUseArtData         = true;

% Load data
if lUseArtData
    % Load artificial data
    fprintf(2,'Loading artificial data \n');
    load([sInDataPath,'b0ArtDataOLHC.mat'])
else
    % Load empirical data
    fprintf(2,'Loading empirical data \n');
    load([sInDataPath,'b0DataOLHC.mat']);
end

% Determine dimensions
[iNumObs, iNumAssets] = size(mClose);

%% Set design choices
% Here we create datasets for various research design choices that affect
% the dataset construction
load('DesignChoices.mat','tDataCombos');
iNumDataCombos = size(tDataCombos,1);

%% Get data
% Loop over combinations
for iIdxC = 1:iNumDataCombos
    % Copy data (allows running the loop in parallel)
    rData           = struct();
    rChars          = struct();
    rData.mReturns  = mReturns;
    rData.mClose    = mClose;
    rData.mVolume   = mVolume;
    rData.mME       = mME;
    rData.yrmoda    = yrmoda;
    rData.mLow      = mLow;
    rData.mHigh     = mHigh;
    rData.mOpen     = mOpen;

    % Determine dimensions
    [iNumObs, iNumAssets] = size(rData.mClose);
    
    % Print progress
    fprintf(2, 'Combo %i of %i \n', iIdxC, iNumDataCombos);

    % Extract setting
    sOutlierTreatment   = tDataCombos.cOutlierTreatment{iIdxC};
    dThreshold          = tDataCombos.vThreshold(iIdxC);

    % Winsorize/truncate returns
    switch upper(sOutlierTreatment)
        case 'WIN'
            % Winsorize returns
            rData.mReturns = fWinsorize(rData.mReturns, dThreshold, 1, 2);
        case 'TRUNC'
            % Trunctate returns
            rData.mReturns = fTruncate(rData.mReturns, dThreshold, 1, 2);
    end
    % Find changed data
    lChanged                = rData.mReturns ~= mReturns;
    rData.mClose(lChanged)  = NaN;

    % Ensure that every observation has price, volume, and market value
    [lIsAvailObs, rData.mME, rData.mVolume, rData.mClose, rData.mLow, rData.mHigh, rData.mOpen] = ...
        fEnsureAllObs(rData.mME, rData.mVolume, rData.mClose, rData.mLow, rData.mHigh, rData.mOpen);

    % Set volume to missing if zero for all assets (affects time series)
    lSetMissing                     = all(isnan(rData.mVolume) | rData.mVolume == 0, 2);
    rData.mVolume(lSetMissing,:)    = NaN;

    % Set cryptos with less than a few cents of trading volume to missing
    rData.mVolume(rData.mVolume <= 1e-6) = NaN;

    % Set observations to missing if trading volume is zero but return is not
    lSetMissing                 = rData.mVolume == 0 & (rData.mReturns ~= 0);
    rData.mVolume(lSetMissing)  = NaN;

    % Calculate value-weighted market return
    mME_L1          = [NaN(1, iNumAssets); rData.mME(1:end-1,:)];
    mWts            = mME_L1 ./ sum(mME_L1,2,'omitmissing');
    rData.vMktRet   = sum(rData.mReturns .* mWts, 2, 'omitnan');
    rData.vMktRet(all(isnan(mWts), 2)) = NaN;

    % Calculate the rolling market volatility
    vMktVol         = NaN(iNumObs, 1);
    iNumInMktVol    = 28;
    for iIdxT = iNumInMktVol:iNumObs
        % Get data
        vIdxInSample    = (iIdxT-iNumInMktVol+1):iIdxT;
        vMktRetTemp     = rData.vMktRet(vIdxInSample);

        % Calculate volatility
        vMktVol(iIdxT)  = std(vMktRetTemp);
    end
    rData.vMktVol       = vMktVol;

    % Set up characteristic calculation
    fprintf('Start calculation of characteristics \n');
    chars = characteristics(rData.mReturns, rData.mME, rData.mVolume, rData.mClose, rData.vMktRet);

    % Calculate significant Liu et al. characteristics
    % === Size strategies
    rChars.mcap     = chars.fMarketCap(true);
    rChars.prc      = chars.fPrice();
    rChars.maxdprc  = chars.fMaxPrc(7);

    % === Momentum strategies
    rChars.ret_1_0 = chars.fMomentum(7, 0);
    rChars.ret_2_0 = chars.fMomentum(14, 0);
    rChars.ret_3_0 = chars.fMomentum(21, 0);
    rChars.ret_4_0 = chars.fMomentum(28, 0);
    rChars.ret_4_1 = chars.fMomentum(28, 7);

    % === Volume strategies (Dollar trading volume * price (Liu
    % replication))
    rChars.prcvol = chars.fPrcVolumeLiu(7, 0, 1, true, true);
    rChars.volscaled    = chars.fPrcVolumeLiu(7, 0, 1, false, true);
    rChars.volscaled    = rChars.volscaled ./ chars.mME;
    [~, rChars.stdprcvol] = chars.fPrcVolumeLiu(7, 0, 5, true, true);

    % === Insignificant strategy returns
    rChars.age          = chars.fAge();
    rChars.ret_8_0      = chars.fMomentum(7 * 8, 0);
    rChars.ret_16_0     = chars.fMomentum(7 * 16, 0);
    rChars.ret_50_0     = chars.fMomentum(7 * 50, 0);
    rChars.ret_100_0    = chars.fMomentum(7 * 100, 0);
    rChars.volume       = chars.fAvgVolume(7, 0, 1, true);
    rChars.illiq        = chars.fAmihud(7, 0, 5);
    [~, rChars.beta, ivol, iskew, mR2] = chars.fAlphaBetaSigma(rData.vMktRet,365,0,250);
    rChars.beta2        = rChars.beta.^2;
    rChars.ivol         = ivol;
    rChars.iskew        = iskew;
    [rChars.retvol, rChars.skew1] = chars.fVolatility(7, 0, 5);
    [~, rChars.skew2]   = chars.fVolatility(365, 0, 250);
    rChars.maxret       = chars.fMaxRet(7, 0, 1);
    mX                  = lagmatrix(rData.vMktRet, [0,1,2]);
    [~,~,~,mR2lag]      = chars.fAlphaBetaSigma(mX, 365, 0, 250);
    rChars.delay        = 1 - (mR2./mR2lag);

    % Calculate BB characteristics
    [rChars.var90d, rChars.es90d]       = chars.fVaR(90, 0, 20, 0.05);

    % Prospect theory value (takes a lot of time)
    rChars.ppt = chars.fProspectTheory(365, 0, 250);

    % Salience Theory value
    rChars.stv = chars.fSalienceTheory(21, 0, 14);
    
    % Calculate the market illiquidity
    rData.vMktIlliq     = sum(rChars.illiq .* mWts, 2, 'omitnan');
    rData.vMktIlliq(all(isnan(mWts), 2)) = 0;
    rData.vMktIlliq(all(isnan(rChars.illiq) | rChars.illiq == 0,2)) = NaN;

    % Set up indicator calculation
    fprintf('Start calculation of technical indicators \n');
    indicators = technical_indicators(rData.mClose, rData.mOpen, rData.mHigh, rData.mLow, rData.mVolume);

    % Calculate RSI
    rChars.rsi = indicators.fRelativeStrengthIndex();

    % Calculate stochastic RSI
    rChars.stochRSI = indicators.fStochasticRSI(14,5);

    % Calculate stochastic oscillators
    [rChars.stochK, rChars.stochD] = indicators.fStochasticOscillator(14, 3, 14);

    % Calculate CCI
    rChars.cci = indicators.fCCI('iMinNumObs',10);

    % Calculate SMAs to price ratios
    vLagLength = [3, 5, 10, 20, 50, 100, 200];
    for iIdxL = 1:length(vLagLength)
        % Get lag
        iWindowSize = vLagLength(iIdxL);
        sWindowSize = num2str(iWindowSize);

        % Calculate signal
        rChars.(['sma_', sWindowSize,'d']) = ...
            indicators.fSimpleMovingAverage(rData.mClose, iWindowSize)./rData.mClose;
    end

    % Calculate MACD and signal line (scaling -> PPO)
    [rChars.macd, macd_signal]  = indicators.fPercentagePriceOscillator();
    rChars.macd_diff_signal     = rChars.macd - macd_signal;

    % Calculate volume SMAs to volume ratios
    vLagLength = [3, 5, 10, 20, 50, 100, 200];
    for iIdxL = 1:length(vLagLength)
        % Get lag
        iWindowSize = vLagLength(iIdxL);
        sWindowSize = num2str(iWindowSize);

        % Calculate signal
        mVolTemp = rData.mVolume;
        mVolTemp(mVolTemp == 0) = NaN;
        rChars.(['volsma_', sWindowSize,'d']) = ...
            indicators.fSimpleMovingAverage(rData.mVolume, iWindowSize)./mVolTemp;
    end

    % Calculate PVO
    [rChars.volmacd, volmacd_signal] = indicators.fPercentageVolumeOscillator();
    rChars.volmacd_diff_signal = rChars.volmacd - volmacd_signal;

    % Calculate Chaikin Money Flow
    rChars.chaikin = indicators.fChaikinMoneyFlow(21, 10);

    % Calculate bollinger bands
    [boll_low, boll_mid, boll_high] = indicators.fBollingerBands();
    rChars.boll_low = boll_low./indicators.mClose;
    rChars.boll_mid = boll_mid./indicators.mClose;
    rChars.boll_high = boll_high./indicators.mClose;
    rChars.boll_width = (boll_high - boll_low)./boll_mid;

    % Return with one day lead: This is required for the implementation
    % lag, which we discuss as a research design choice
    rData.mRetLead = lagmatrix(rData.mReturns, -1);

    %% Resample data
    fprintf('Start resampling to weekly data \n');
    
    % Resample characteristics (use last valid observation)
    cFieldnames  = fieldnames(rChars);
    sMethodChars = 'last'; % Use last nonmissing observation
    for iIdxF = 1:length(cFieldnames)
        % Get characteristic
        mCharTemp = rChars.(cFieldnames{iIdxF});
        if lResampleLiuEtAl
            % This resamples the data as in Liu et al.
            [rChars.(cFieldnames{iIdxF}), vDatesWeek] = ...
                fResampleLiuEtAl(mCharTemp, yrmoda, ...
                'sMethod', sMethodChars, 'iMinNumObs', 1);
        else
            % This resamples the data from sunday to sunday
            rChars.(cFieldnames{iIdxF}) = fResampleDay2Week(mCharTemp, yrmoda, ...
                'sMethod', sMethodChars, 'iMinNumObs',1);
        end
    end

    % Resample market return
    if lResampleLiuEtAl
        rData.vMktRet = fResampleLiuEtAl( 1 + rData.vMktRet, yrmoda,...
            'sMethod','prod', 'iMinNumObs','all');
    else
        rData.vMktRet = fResampleDay2Week( 1 + rData.vMktRet, yrmoda, ...
            'sMethod','prod', 'iMinNumObs',7);
    end
    rData.vMktRet = rData.vMktRet - 1;

    % Resample market illiquidity
    if lResampleLiuEtAl
        rData.vMktIlliq = fResampleLiuEtAl( rData.vMktIlliq, yrmoda,...
            'sMethod','last', 'iMinNumObs',1);
    else
        rData.vMktIlliq = fResampleDay2Week( rData.vMktIlliq, yrmoda, ...
            'sMethod','last','iMinNumObs',1);
    end

    % Resample market volatiliy
    if lResampleLiuEtAl
        rData.vMktVol = fResampleLiuEtAl( rData.vMktVol, yrmoda, ...
            'sMethod', 'last', 'iMinNumObs',1);
    else
        rData.vMktVol = fResampleDay2Week( rData.vMktVol, yrmoda, ...
            'sMethod','last','iMinNumObs',1);
    end

    % Resample returns (use product - REQUIRE ALL OBSERVATIONS)
    if lResampleLiuEtAl
        [rData.mReturns,vDates] = fResampleLiuEtAl( 1 + rData.mReturns, yrmoda,...
            'sMethod','prod', 'iMinNumObs','all');
    else
        [rData.mReturns,vDates] = fResampleDay2Week( 1 + rData.mReturns, yrmoda, ...
            'sMethod','prod', 'iMinNumObs',7);
    end
    rData.mReturns = rData.mReturns-1;

    % Resample leaded return
    if lResampleLiuEtAl
        [rData.mRetLead,vDatesLead] = fResampleLiuEtAl( 1 + rData.mRetLead, yrmoda, ...
            'sMethod','prod', 'iMinNumObs','all');
    else
        [rData.mRetLead,vDatesLead] = fResampleDay2Week( 1 + rData.mRetLead, yrmoda, ...
            'sMethod','prod', 'iMinNumObs',7);
    end
    rData.mRetLead = rData.mRetLead - 1;

    % Resample closing prices (last valid observation)
    if lResampleLiuEtAl
        [rData.mClose, dates] = fResampleLiuEtAl(rData.mClose, rData.yrmoda,...
            'sMethod','last', 'iMinNumObs',1);
    else
        [rData.mClose, dates] = fResampleDay2Week(rData.mClose, rData.yrmoda, ...
            'sMethod','last','iMinNumObs',1);
    end

    % Resample volume (sum over the past week)
    if lResampleLiuEtAl
        rData.mVolume = fResampleLiuEtAl(rData.mVolume, rData.yrmoda, ...
            'sMethod','sum', 'iMinNumObs',1);
    else
        rData.mVolume = fResampleDay2Week(rData.mVolume, rData.yrmoda, ...
            'sMethod','sum','iMinNumObs',1);
    end

    % Resample market capitalization (last valid observation)
    if lResampleLiuEtAl
        [rData.mME,vDatesME] = fResampleLiuEtAl(rData.mME, rData.yrmoda, ...
            'sMethod','last', 'iMinNumObs',1);
    else
        [rData.mME,vDatesME] = fResampleDay2Week(rData.mME, rData.yrmoda, ...
            'sMethod','last','iMinNumObs',1);
    end

    % Change date
    rData.yrmoda = vDates;
        
    % Create filename
    sFilename = sprintf('%sbCharacteristicsOLHC_Combo%i.mat', sOutDataPath, tDataCombos.data_setting(iIdxC));

    % Save 
    rData = rmfield(rData, {'mLow', 'mOpen', 'mHigh'});
    fprintf('Start saving data \n');
    fSaveData(sFilename, rChars, rData, rMetaData);
    fprintf(2, 'Combo %i of %i saved \n', iIdxC, iNumDataCombos);
end

function fSaveData(sFilename, rChars, rData, rMetaData)
    save(sFilename, 'rChars', 'rData', 'rMetaData', '-v7.3');
end

% Restore path
path(sOldPath);